In [1]:
import nltk
import numpy as np
from string import punctuation

In [2]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [3]:
tagset = set([k[1] for k in nltk.corpus.treebank.tagged_words()])
print(tagset)

{'-LRB-', 'CD', ',', 'VBD', '$', 'RB', 'FW', 'WDT', "''", 'JJR', 'MD', 'POS', 'LS', 'VBN', 'NNS', 'WP', '#', 'IN', 'UH', 'RP', 'NN', '``', 'TO', '.', 'RBR', 'PRP$', 'JJ', 'PDT', 'NNPS', '-RRB-', 'VBG', 'RBS', 'CC', 'VBZ', 'PRP', 'WP$', 'SYM', 'JJS', 'VBP', 'EX', 'DT', 'VB', ':', '-NONE-', 'WRB', 'NNP'}


In [4]:
nltk.help.upenn_tagset("VBN")

VBN: verb, past participle
    multihulled dilapidated aerosolized chaired languished panelized used
    experimented flourished imitated reunifed factored condensed sheared
    unsettled primed dubbed desired ...


In [5]:
text = nltk.corpus.treebank.tagged_sents()
print(text[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [6]:
print(*[i[0] for i in text[0]])
print([i[1] for i in text[0]])

Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


In [7]:
len(text)

3914

In [8]:
len(nltk.corpus.treebank.words())

100676

In [9]:
len(list(set(nltk.corpus.treebank.words())))

12408

In [10]:
len(set([i[1] for s in text for i in s]))

46

In [11]:
print(len(punctuation), punctuation)

32 !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


# Context Function

In [12]:
def consent(s):
    """coordinates represents, in the same order:
    preceding word (by 2 positions), 
    preceding word (by 1 position)
    central word
    1st following word
    2nd following word
    is it a number
    starts with capital
    is it a starting word
    is an ending word
    etc."""   
    contxt = {}
    l = len(s)
    for i, w in enumerate(s):
        contxt[(w[0],w[1],i)] = ((lambda x : s[i-2][0] if x>=2 else 0)(i), (lambda x : s[i-1][0] if x>=1 else 0)(i), 
         s[i][0], (lambda x : s[i+1][0] if x<l-1 else 0)(i), (lambda x : s[i+2][0] if x<l-2 else 0)(i),
                                 int(w[0].isdigit()), int(w[0][0].isupper()), 
         int((lambda x : x==0)(i)), int((lambda x : x==l-1)(i)))
    return contxt

In [13]:
print("Tag --->  Context", end="\n\n")
print(*[str(k[1])+str(" ---> ")+str(v) for k, v in consent(text[0]).items()], sep="\n")

Tag --->  Context

NNP ---> (0, 0, 'Pierre', 'Vinken', ',', 0, 1, 1, 0)
NNP ---> (0, 'Pierre', 'Vinken', ',', '61', 0, 1, 0, 0)
, ---> ('Pierre', 'Vinken', ',', '61', 'years', 0, 0, 0, 0)
CD ---> ('Vinken', ',', '61', 'years', 'old', 1, 0, 0, 0)
NNS ---> (',', '61', 'years', 'old', ',', 0, 0, 0, 0)
JJ ---> ('61', 'years', 'old', ',', 'will', 0, 0, 0, 0)
, ---> ('years', 'old', ',', 'will', 'join', 0, 0, 0, 0)
MD ---> ('old', ',', 'will', 'join', 'the', 0, 0, 0, 0)
VB ---> (',', 'will', 'join', 'the', 'board', 0, 0, 0, 0)
DT ---> ('will', 'join', 'the', 'board', 'as', 0, 0, 0, 0)
NN ---> ('join', 'the', 'board', 'as', 'a', 0, 0, 0, 0)
IN ---> ('the', 'board', 'as', 'a', 'nonexecutive', 0, 0, 0, 0)
DT ---> ('board', 'as', 'a', 'nonexecutive', 'director', 0, 0, 0, 0)
JJ ---> ('as', 'a', 'nonexecutive', 'director', 'Nov.', 0, 0, 0, 0)
NN ---> ('a', 'nonexecutive', 'director', 'Nov.', '29', 0, 0, 0, 0)
NNP ---> ('nonexecutive', 'director', 'Nov.', '29', '.', 0, 1, 0, 0)
CD ---> ('director',

## Choix des features

In [14]:
freq = 10 # filtre qui sélectionne les features apparaissant plus que freq fois

### Preceding word by two positions

In [15]:
tag_prec2 = [(k[1],v[0]) for s in text for k, v in consent(s).items()]
tag_prec2 = nltk.Text(tag_prec2)
tag_prec2_freq = nltk.FreqDist(tag_prec2)
selected_tag_prec2 = [couple for couple in tag_prec2_freq.keys() if tag_prec2_freq[couple]>5]
print("La nombre de paires (tag, word-2) apparaissant plus de 5 fois est: ", len(selected_tag_prec2))
selected_prec2 = list(set([w[1] for w in selected_tag_prec2]))
print("Le nombre de feature (preceding_word_by_2_positions) sélectionnés est: ", len(selected_prec2))
print("Exemple: ", selected_prec2[:5])

La nombre de paires (tag, word-2) apparaissant plus de 5 fois est:  1947
Le nombre de feature (preceding_word_by_2_positions) sélectionnés est:  603
Exemple:  [0, 'into', 'raise', 'first', 'effect']


### Preceding word by one position

In [16]:
tag_prec1 = [(k[1],v[1]) for s in text for k, v in consent(s).items()]
tag_prec1 = nltk.Text(tag_prec1)
tag_prec1_freq = nltk.FreqDist(tag_prec1)
selected_tag_prec1 = [couple for couple in tag_prec1_freq.keys() if tag_prec1_freq[couple]>5]
print("La nombre de paires (tag, word-1) apparaissant plus de 5 fois est: ", len(selected_tag_prec1))
selected_prec1 = list(set([w[1] for w in selected_tag_prec1]))
print("Le nombre de feature (preceding_word_by_1_positions) sélectionnés est: ", len(selected_prec1))
print("Exemple: ", selected_prec1[:5])

La nombre de paires (tag, word-1) apparaissant plus de 5 fois est:  2083
Le nombre de feature (preceding_word_by_1_positions) sélectionnés est:  999
Exemple:  [0, 'ringers', 'into', 'raise', 'first']


### Following word by one position

In [17]:
tag_follow1 = [(k[1],v[3]) for s in text for k, v in consent(s).items()]
tag_follow1 = nltk.Text(tag_follow1)
tag_follow1_freq = nltk.FreqDist(tag_follow1)
selected_tag_follow1 = [couple for couple in tag_follow1_freq.keys() if tag_follow1_freq[couple]>5]
print("La nombre de paires (tag, word+1) apparaissant plus de 5 fois est: ", len(selected_tag_follow1))
selected_follow1 = list(set([w[1] for w in selected_tag_follow1]))
print("Le nombre de feature (following_word_by_1_positions) sélectionnés est: ", len(selected_follow1))
print("Exemple: ", selected_follow1[:5])

La nombre de paires (tag, word+1) apparaissant plus de 5 fois est:  2119
Le nombre de feature (following_word_by_1_positions) sélectionnés est:  974
Exemple:  [0, 'ringers', 'hour', 'into', 'raise']


### Following word by two positions

In [18]:
tag_follow2 = [(k[1],v[4]) for s in text for k, v in consent(s).items()]
tag_follow2 = nltk.Text(tag_follow2)
tag_follow2_freq = nltk.FreqDist(tag_follow2)
selected_tag_follow2 = [couple for couple in tag_follow2_freq.keys() if tag_follow2_freq[couple]>5]
print("La nombre de paires (tag, word+2) apparaissant plus de 5 fois est: ", len(selected_tag_follow2))
selected_follow2 = list(set([w[1] for w in selected_tag_follow2]))
print("Le nombre de feature (following_word_by_2_positions) sélectionnés est: ", len(selected_follow2))
print("Exemple: ", selected_follow2[:5])

La nombre de paires (tag, word+2) apparaissant plus de 5 fois est:  1951
Le nombre de feature (following_word_by_2_positions) sélectionnés est:  600
Exemple:  [0, 'into', 'raise', 'first', 'taken']


### Central word

In [19]:
tag_w0 = [(k[1],v[2]) for s in text for k, v in consent(s).items()]
tag_w0 = nltk.Text(tag_w0)
tag_w0_freq = nltk.FreqDist(tag_w0)
selected_tag_w0 = [couple for couple in tag_w0_freq.keys() if tag_w0_freq[couple]>1]
print("La nombre de paires (tag, word) apparaissant plus de 5 fois est: ", len(selected_tag_w0))
selected_w0 = list(set([w[1] for w in selected_tag_w0]))
print("Le nombre de feature (central_word) sélectionnés est: ", len(selected_w0))
print("Exemple: ", selected_w0[:5])

La nombre de paires (tag, word) apparaissant plus de 5 fois est:  6332
Le nombre de feature (central_word) sélectionnés est:  5753
Exemple:  ['Estate', 'settled', '1\\/2', 'Serial', 'increasingly']


In [20]:
def context(s):
    """coordinates : the same order as for consent"""
    #if len(s)<=1:
    #    s = nltk.word_tokenize(s)
    contxt = {}
    l = len(s)
    for i, w in enumerate(s): #(w[1],i) >> we add i to ensure having repeated tags
        contxt[(w[1], i)] = ((lambda x : s[i-2][0] if x>=2 else 0)(i), (lambda x : s[i-1][0] if x>=1 else 0)(i), 
         s[i][0], (lambda x : s[i+1][0] if x<l-1 else 0)(i), (lambda x : s[i+2][0] if x<l-2 else 0)(i),
                                 int(w[0].isdigit()), int(w[0][0].isupper()), 
         int((lambda x : x==0)(i)), int((lambda x : x==l-1)(i)))
    return [(k[0],contxt[k]) for k in contxt.keys()]

In [21]:
context(text[0])

[('NNP', (0, 0, 'Pierre', 'Vinken', ',', 0, 1, 1, 0)),
 ('NNP', (0, 'Pierre', 'Vinken', ',', '61', 0, 1, 0, 0)),
 (',', ('Pierre', 'Vinken', ',', '61', 'years', 0, 0, 0, 0)),
 ('CD', ('Vinken', ',', '61', 'years', 'old', 1, 0, 0, 0)),
 ('NNS', (',', '61', 'years', 'old', ',', 0, 0, 0, 0)),
 ('JJ', ('61', 'years', 'old', ',', 'will', 0, 0, 0, 0)),
 (',', ('years', 'old', ',', 'will', 'join', 0, 0, 0, 0)),
 ('MD', ('old', ',', 'will', 'join', 'the', 0, 0, 0, 0)),
 ('VB', (',', 'will', 'join', 'the', 'board', 0, 0, 0, 0)),
 ('DT', ('will', 'join', 'the', 'board', 'as', 0, 0, 0, 0)),
 ('NN', ('join', 'the', 'board', 'as', 'a', 0, 0, 0, 0)),
 ('IN', ('the', 'board', 'as', 'a', 'nonexecutive', 0, 0, 0, 0)),
 ('DT', ('board', 'as', 'a', 'nonexecutive', 'director', 0, 0, 0, 0)),
 ('JJ', ('as', 'a', 'nonexecutive', 'director', 'Nov.', 0, 0, 0, 0)),
 ('NN', ('a', 'nonexecutive', 'director', 'Nov.', '29', 0, 0, 0, 0)),
 ('NNP', ('nonexecutive', 'director', 'Nov.', '29', '.', 0, 1, 0, 0)),
 ('CD',

In [22]:
new_dataset = []
for txt in text:
    new_dataset.extend(context(txt))
    
new_dataset[15:30]

[('NNP', ('nonexecutive', 'director', 'Nov.', '29', '.', 0, 1, 0, 0)),
 ('CD', ('director', 'Nov.', '29', '.', 0, 1, 0, 0, 0)),
 ('.', ('Nov.', '29', '.', 0, 0, 0, 0, 0, 1)),
 ('NNP', (0, 0, 'Mr.', 'Vinken', 'is', 0, 1, 1, 0)),
 ('NNP', (0, 'Mr.', 'Vinken', 'is', 'chairman', 0, 1, 0, 0)),
 ('VBZ', ('Mr.', 'Vinken', 'is', 'chairman', 'of', 0, 0, 0, 0)),
 ('NN', ('Vinken', 'is', 'chairman', 'of', 'Elsevier', 0, 0, 0, 0)),
 ('IN', ('is', 'chairman', 'of', 'Elsevier', 'N.V.', 0, 0, 0, 0)),
 ('NNP', ('chairman', 'of', 'Elsevier', 'N.V.', ',', 0, 1, 0, 0)),
 ('NNP', ('of', 'Elsevier', 'N.V.', ',', 'the', 0, 1, 0, 0)),
 (',', ('Elsevier', 'N.V.', ',', 'the', 'Dutch', 0, 0, 0, 0)),
 ('DT', ('N.V.', ',', 'the', 'Dutch', 'publishing', 0, 0, 0, 0)),
 ('NNP', (',', 'the', 'Dutch', 'publishing', 'group', 0, 1, 0, 0)),
 ('VBG', ('the', 'Dutch', 'publishing', 'group', '.', 0, 0, 0, 0)),
 ('NN', ('Dutch', 'publishing', 'group', '.', 0, 0, 0, 0, 0))]

In [23]:
# Feature vector block that looks to the preceding word by two positions
def f0(row):
    tmp = np.zeros(len(selected_prec2), dtype=np.int8)
    x = row[1][0]
    if x in selected_prec2:
        tmp[selected_prec2.index(x)]=1
    return tmp

In [24]:
# Feature vector block that looks to the preceding word by two positions
def f1(row):
    tmp = np.zeros(len(selected_prec1), dtype=np.int8)
    x = row[1][1]
    if x in selected_prec1:
        tmp[selected_prec1.index(x)]=1
    return tmp

In [25]:
# Feature vector block that looks to the central word
def f2(row):
    tmp = np.zeros(len(selected_w0), dtype=np.int8)
    x = row[1][2]
    if x in selected_w0:
        tmp[selected_w0.index(x)]=1
    return tmp

In [26]:
# Feature vector block that looks to the following word by one position
def f3(row):
    tmp = np.zeros(len(selected_follow1), dtype=np.int8)
    x = row[1][3]
    if x in selected_follow1:
        tmp[selected_follow1.index(x)]=1
    return tmp

In [27]:
# Feature vector block that looks to the following word by two positions
def f4(row):
    tmp = np.zeros(len(selected_follow2), dtype=np.int8)
    x = row[1][4]
    if x in selected_follow2:
        tmp[selected_follow2.index(x)]=1
    return tmp

In [28]:
def f(row):
    feature = np.concatenate((f0(row),f1(row), f2(row), f3(row), f4(row), row[1][-4:]))
    return feature

### A small test to control

In [29]:
f(new_dataset[0])

array([1, 0, 0, ..., 1, 1, 0])

In [30]:
# The number of activated features for the first 35 entries (t,x)
print([sum(f(i)) for i in new_dataset[:35]])

[5, 4, 2, 5, 4, 4, 4, 3, 5, 4, 4, 4, 3, 5, 3, 4, 6, 5, 6, 6, 4, 3, 3, 4, 5, 2, 2, 5, 4, 4, 5, 6, 3, 3, 5]


In [31]:
print("The number of activated features is: ", sum(f(new_dataset[33])), " for the pair below.")
print('The tag is: "{}"'.format(new_dataset[33][0]), " and the corresponding context is: ", new_dataset[33][1])

The number of activated features is:  3  for the pair below.
The tag is: ","  and the corresponding context is:  ('Rudolph', 'Agnew', ',', '55', 'years', 0, 0, 0, 0)


In [32]:
teeest = list(map(lambda x : (x[0],sum(f(x))), new_dataset[:10]))
teeest

[('NNP', 5),
 ('NNP', 4),
 (',', 2),
 ('CD', 5),
 ('NNS', 4),
 ('JJ', 4),
 (',', 4),
 ('MD', 3),
 ('VB', 5),
 ('DT', 4)]

In [33]:
len(f(new_dataset[0]))

8933

In [35]:
len(new_dataset)

100676

In [36]:
dataset = map(lambda x : (x[0],f(x)), new_dataset)

In [None]:
dataset = list(dataset)

In [10]:
##############################################""" Context """###################################
text = nltk.corpus.treebank.tagged_sents()
def consent(s):
    """coordinates represents, in the same order:
    preceding word (by 2 positions), 
    preceding word (by 1 position)
    central word
    1st following word
    2nd following word
    is it a number
    starts with capital
    is it a starting word
    is an ending word
    etc."""    
    contxt = {}
    # First word :
    contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], s[1][0],s[2][0],s[0][0].isdigit(), s[0][0][0].isupper(), 1, 0)
    # Second word :
    contxt[(s[1][0],s[1][1],2)] = (0, s[0][0], s[1][0], s[2][0], s[3][0], s[1][0].isdigit(), s[1][0][0].isupper(), 0, 0)
    # For middle words
    for i, w in enumerate(s[2:-2]) :
        contxt[(w[0],w[1],i+3)] = (s[i][0], s[i+1][0], s[i+2][0], s[i+3][0], s[i+4][0], w[0].isdigit(), w[0][0].isupper(), 0, 0)
    # The word before last - the penultimate one - the second word to last
    contxt[(s[-2][0],s[-2][1],len(s)-1)] = (s[-4][0], s[-3][0], s[-2][0], s[-1][0], 0, s[-2][0].isdigit(), s[-2][0][0].isupper(),0, 0)
    # The last word   
    contxt[(s[-1][0],s[-1][1],len(s))] = (s[-3][0], s[-2][0], s[-1][0], 0, 0, s[-1][0].isdigit(), s[-1][0][0].isupper(), 0, 1)
    return ([(k,v) for k,v in contxt.items()])

def context(text):
    txt = []
    for s in text :
        txt.extend(consent(s))
    return txt

In [11]:
context(text[:10])
[(k[0][1],k[1]) for k in context(text[650:1000]) if k[0][1]=="NNP"][:20]

[('NNP', ('0', '*T*-2', 'Richard', 'Drobnick', ',', False, True, 0, 0)),
 ('NNP', ('*T*-2', 'Richard', 'Drobnick', ',', 'director', False, True, 0, 0)),
 ('NNP', ('at', 'the', 'University', 'of', 'Southern', False, True, 0, 0)),
 ('NNP',
  ('University', 'of', 'Southern', 'California', "'s", False, True, 0, 0)),
 ('NNP',
  ('of', 'Southern', 'California', "'s", 'Graduate', False, True, 0, 0)),
 ('NNP', ('California', "'s", 'Graduate', 'School', 'of', False, True, 0, 0)),
 ('NNP', ("'s", 'Graduate', 'School', 'of', 'Business', False, True, 0, 0)),
 ('NNP', ('School', 'of', 'Business', '.', 0, False, True, 0, 0)),
 ('NNP', ('*-1', 'turn', 'Southeast', 'Asia', 'into', False, True, 0, 0)),
 ('NNP', ('turn', 'Southeast', 'Asia', 'into', 'a', False, True, 0, 0)),
 ('NNP',
  ('parallel', 'the', 'European', 'Common', 'Market', False, True, 0, 0)),
 ('NNP',
  ('the', 'European', 'Common', 'Market', 'approach', False, True, 0, 0)),
 ('NNP', ('European', 'Common', 'Market', 'approach', '.', False

In [12]:
freq = 10
def preced2(tagged_sents):
    prec2 = nltk.defaultdict(int)
    precedent2 = nltk.defaultdict(list)
    for t in tagset:
        for i in [k[1] for k in context(tagged_sents) if k[0][1]==t]:
            prec2[i[0]] += 1
        for k,v in prec2.items() :
            if v > freq : precedent2[t].append(k)
        
    return [(k,v) for k,v in precedent2.items()]

In [13]:
preced2(text[:100])

[('JJ', [0]),
 ('FW', [0]),
 ('WDT', [0]),
 ('VBD', [0, 'the', ',']),
 ('VBN', [0, 'the', ',']),
 ('UH', [0, 'the', ',']),
 ('VB', [0, 'the', ',']),
 ('CC', [0, 'the', ',']),
 ('PRP', [0, '*', 'the', ',']),
 ('SYM', [0, '*', 'the', ',']),
 ('NNS', ['of', 0, 'to', 'and', 'in', '*', 'the', ',']),
 ('VBP', ['of', 0, 'to', 'and', 'in', '*', 'the', ',']),
 ('``', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'the', ',']),
 ('RP', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'the', ',']),
 (',', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('NNPS', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('WP', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('VBG', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('-RRB-', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('#', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', ',']),
 ('VBZ', ['*-1', 'of', 0, 'to', 'and', 'in', '*', 'for', 'the', 'that', ',']),
 ('EX', ['*-1', 'o

# the previous words from two positions

In [14]:
p2 = []
for i in [v for k,v in preced2([s for s in text if len(s)>5])]:
    p2.extend(i)
    
p2 = list(set(p2))
p2

[0,
 'Bank',
 'Judge',
 'study',
 'know',
 'minimum',
 'put',
 'package',
 'not',
 '13',
 'volatility',
 'rate',
 '*RNR*-1',
 'And',
 'Johnson',
 'Lane',
 'previous',
 'gains',
 'Union',
 'bad',
 'Herald',
 'Sea',
 'latest',
 'It',
 'plans',
 '-RCB-',
 'unit',
 'case',
 'Many',
 'has',
 'services',
 'across',
 'legislation',
 'wanted',
 'Simmons',
 'known',
 'age',
 'failed',
 'decided',
 'officer',
 'chairman',
 'Phelan',
 'due',
 'far',
 'risks',
 'operations',
 'insurance',
 'authority',
 'percentage',
 '1989',
 'development',
 'Despite',
 'designed',
 'sign',
 'free',
 'Angeles',
 '*-2',
 'ca',
 'By',
 'series',
 'two',
 'junk',
 'Big',
 'marketing',
 'leading',
 'Cray',
 'closed',
 'second',
 'old',
 'enforcement',
 'months',
 'increase',
 'owns',
 'advanced',
 'Wedtech',
 'Such',
 'Scoring',
 'past',
 'Ltd.',
 '11',
 'include',
 'what',
 'Random',
 'increased',
 '5,000',
 'position',
 'publishing',
 'meet',
 'face',
 'their',
 'least',
 'period',
 'There',
 'month',
 'Ford',
 'ma

In [15]:
def preced1(tagged_sents):
    prec1 = nltk.defaultdict(int)
    precedent1 = nltk.defaultdict(list)
    for t in tagset:
        for i in [k[1] for k in context(tagged_sents) if k[0][1]==t]:
            prec1[i[1]] += 1
        for k,v in prec1.items() :
            if v > freq : precedent1[t].append(k)
        
    return [(k,v) for k,v in precedent1.items()]

# Previous words from one position

In [16]:
p1 = []
for i in [v for k,v in preced1([s for s in text if len(s)>5])]:
    p1.extend(i)
    
p1 = list(set(p1))
p1

[0,
 'Bank',
 'Judge',
 'study',
 'know',
 'minimum',
 'conference',
 'put',
 'package',
 'not',
 '13',
 'volatility',
 '*RNR*-1',
 'rate',
 'And',
 'Johnson',
 'Corp',
 'Lane',
 'previous',
 'gains',
 'Union',
 'bad',
 'Sea',
 'latest',
 'It',
 '-RCB-',
 'unit',
 'case',
 'Many',
 'has',
 'services',
 'across',
 'legislation',
 'wanted',
 'Simmons',
 'known',
 'age',
 'failed',
 'decided',
 'win',
 'officer',
 'chairman',
 'Phelan',
 'due',
 'far',
 'risks',
 'operations',
 'insurance',
 'authority',
 'percentage',
 '1989',
 'development',
 'Despite',
 'designed',
 'bottle',
 'sign',
 'free',
 'Angeles',
 'publicly',
 '*-2',
 'ca',
 'rather',
 'series',
 'two',
 'junk',
 'Big',
 'By',
 'marketing',
 'leading',
 'second',
 'closed',
 'Cray',
 'old',
 'enforcement',
 'months',
 'increase',
 'owns',
 'Constitution',
 'Wedtech',
 'advanced',
 'Scoring',
 'Such',
 'past',
 'Ltd.',
 '11',
 'Random',
 'what',
 'include',
 'increased',
 '5,000',
 'overseas',
 'member',
 'position',
 'publishi

In [17]:
def current(tagged_sents):
    cur = nltk.defaultdict(int)
    curr = nltk.defaultdict(list)
    for t in tagset:
        for i in [k[1] for k in context(tagged_sents) if k[0][1]==t]:
            cur[i[2]] += 1
        for k,v in cur.items() :
            if v > freq : curr[t].append(k)
        
    return [(k,v) for k,v in curr.items()]

# Current words

In [18]:
c = []
for i in [v for k,v in current([s for s in text if len(s)>5])]:
    c.extend(i)
    
c = list(set(c))
c

['study',
 'Bank',
 'Judge',
 'know',
 'minimum',
 'conference',
 'put',
 'package',
 'not',
 '13',
 'volatility',
 'rate',
 '*RNR*-1',
 'And',
 'Johnson',
 'Corp',
 'Lane',
 'previous',
 'gains',
 'Union',
 'bad',
 'Herald',
 '-RCB-',
 'Sea',
 'It',
 'latest',
 'unit',
 'case',
 'Many',
 'services',
 'has',
 'across',
 'wanted',
 'legislation',
 'Simmons',
 'known',
 'age',
 'failed',
 'decided',
 'win',
 'officer',
 'chairman',
 'Phelan',
 'due',
 'far',
 'risks',
 'operations',
 'authority',
 'percentage',
 'insurance',
 '1989',
 'development',
 'Despite',
 'designed',
 'bottle',
 'sign',
 'free',
 'publicly',
 'Angeles',
 'series',
 'ca',
 '*-2',
 'By',
 'two',
 'junk',
 'Big',
 'marketing',
 'leading',
 'second',
 'closed',
 'Cray',
 'old',
 'enforcement',
 'increase',
 'months',
 'advanced',
 'owns',
 'Constitution',
 'Such',
 'Scoring',
 'Wedtech',
 'past',
 'Ltd.',
 '11',
 'include',
 'what',
 'Random',
 'increased',
 'overseas',
 '5,000',
 'member',
 'position',
 'publishing',

In [19]:
def follow1(tagged_sents):
    fol = nltk.defaultdict(int)
    foll = nltk.defaultdict(list)
    for t in tagset:
        for i in [k[1] for k in context(tagged_sents) if k[0][1]==t]:
            fol[i[3]] += 1
        for k,v in fol.items() :
            if v > freq : foll[t].append(k)
        
    return [(k,v) for k,v in foll.items()]

f1 = []
for i in [v for k,v in follow1([s for s in text if len(s)>5])]:
    f1.extend(i)
    
f1 = list(set(f1))
f1

[0,
 'Bank',
 'Judge',
 'study',
 'know',
 'minimum',
 'conference',
 'put',
 'package',
 'not',
 '13',
 'volatility',
 'rate',
 '*RNR*-1',
 'Johnson',
 'Corp',
 'Lane',
 'previous',
 'gains',
 'Union',
 'bad',
 'Herald',
 '-RCB-',
 'Sea',
 'It',
 'latest',
 'unit',
 'case',
 'has',
 'services',
 'across',
 'legislation',
 'wanted',
 'Simmons',
 'known',
 'age',
 'failed',
 'decided',
 'win',
 'officer',
 'chairman',
 'Phelan',
 'due',
 'far',
 'risks',
 'operations',
 'insurance',
 'authority',
 'percentage',
 '1989',
 'development',
 'designed',
 'bottle',
 'sign',
 'free',
 'Angeles',
 'publicly',
 '*-2',
 'ca',
 'series',
 'two',
 'junk',
 'Big',
 'marketing',
 'leading',
 'closed',
 'Cray',
 'second',
 'old',
 'enforcement',
 'months',
 'increase',
 'owns',
 'advanced',
 'Constitution',
 'Scoring',
 'past',
 'Ltd.',
 '11',
 'include',
 'what',
 'Random',
 'increased',
 '5,000',
 'overseas',
 'member',
 'position',
 'publishing',
 'meet',
 'face',
 'their',
 'least',
 'period',
 'm

In [20]:
def follow2(tagged_sents):
    fol = nltk.defaultdict(int)
    foll = nltk.defaultdict(list)
    for t in tagset:
        for i in [k[1] for k in context(tagged_sents) if k[0][1]==t]:
            fol[i[4]] += 1
        for k,v in fol.items() :
            if v > freq : foll[t].append(k)
        
    return [(k,v) for k,v in foll.items()]

f2 = []
for i in [v for k,v in follow2([s for s in text if len(s)>5])]:
    f2.extend(i)
    
f2 = list(set(f2))
f2

[0,
 'Bank',
 'Judge',
 'know',
 'minimum',
 'conference',
 'put',
 'package',
 'not',
 '13',
 'volatility',
 '*RNR*-1',
 'rate',
 'Johnson',
 'Corp',
 'previous',
 'gains',
 'Union',
 'bad',
 'Sea',
 '-RCB-',
 'latest',
 'unit',
 'case',
 'has',
 'services',
 'across',
 'legislation',
 'wanted',
 'known',
 'age',
 'failed',
 'decided',
 'win',
 'officer',
 'chairman',
 'Phelan',
 'due',
 'far',
 'risks',
 'operations',
 'insurance',
 'authority',
 '1989',
 'development',
 'designed',
 'bottle',
 'sign',
 'free',
 'Angeles',
 'publicly',
 '*-2',
 'ca',
 'two',
 'junk',
 'Big',
 'marketing',
 'leading',
 'Cray',
 'closed',
 'second',
 'old',
 'enforcement',
 'months',
 'increase',
 'owns',
 'past',
 'Ltd.',
 'Random',
 'include',
 'what',
 '11',
 'increased',
 'overseas',
 '5,000',
 'member',
 'position',
 'publishing',
 'meet',
 'face',
 'their',
 'least',
 'Ford',
 'month',
 'makes',
 'period',
 '18',
 'goes',
 'Trade',
 'lower',
 'shareholder',
 'sale',
 'year',
 'convertible',
 'mak

In [21]:
##############################################""" Context """###################################
text = nltk.corpus.treebank.tagged_sents()
def consent2(s):
    """coordinates represents, in the same order:
    preceding word (by 2 positions), 
    preceding word (by 1 position)
    central word
    1st following word
    2nd following word
    is it a number
    starts with capital
    is it a starting word
    is an ending word
    etc."""    
    s = nltk.word_tokenize(s)
    contxt = {}
    # First word :
    contxt[(s[0],1)] = (0, 0, s[0], s[1],s[2],s[0].isdigit(), s[0][0].isupper(), 1, 0)
    # Second word :
    contxt[(s[1],2)] = (0, s[0], s[1], s[2], s[3], s[1].isdigit(), s[1][0].isupper(), 0, 0)
    # For middle words
    for i, w in enumerate(s[2:-2]) :
        contxt[(w,i+3)] = (s[i], s[i+1], s[i+2], s[i+3], s[i+4], w.isdigit(), w[0].isupper(), 0, 0)
    # The word before last - the penultimate one - the second word to last
    contxt[(s[-2],len(s)-1)] = (s[-4], s[-3], s[-2], s[-1], 0, s[-2].isdigit(), s[-2][0].isupper(),0, 0)
    # The last word   
    contxt[(s[-1],len(s))] = (s[-3], s[-2], s[-1], 0, 0, s[-1].isdigit(), s[-1][0].isupper(), 0, 1)
    return ([(k[0],v) for k,v in contxt.items()])

def context2(text):
    txt = []
    for s in nltk.sent_tokenize(text) :
        txt.extend(consent2(s))
    return txt

In [22]:
context2("Yes I agree with you my best friend. I'm sorry for all this problems.")

[('Yes', (0, 0, 'Yes', 'I', 'agree', False, True, 1, 0)),
 ('I', (0, 'Yes', 'I', 'agree', 'with', False, True, 0, 0)),
 ('agree', ('Yes', 'I', 'agree', 'with', 'you', False, False, 0, 0)),
 ('with', ('I', 'agree', 'with', 'you', 'my', False, False, 0, 0)),
 ('you', ('agree', 'with', 'you', 'my', 'best', False, False, 0, 0)),
 ('my', ('with', 'you', 'my', 'best', 'friend', False, False, 0, 0)),
 ('best', ('you', 'my', 'best', 'friend', '.', False, False, 0, 0)),
 ('friend', ('my', 'best', 'friend', '.', 0, False, False, 0, 0)),
 ('.', ('best', 'friend', '.', 0, 0, False, False, 0, 1)),
 ('I', (0, 0, 'I', "'m", 'sorry', False, True, 1, 0)),
 ("'m", (0, 'I', "'m", 'sorry', 'for', False, False, 0, 0)),
 ('sorry', ('I', "'m", 'sorry', 'for', 'all', False, False, 0, 0)),
 ('for', ("'m", 'sorry', 'for', 'all', 'this', False, False, 0, 0)),
 ('all', ('sorry', 'for', 'all', 'this', 'problems', False, False, 0, 0)),
 ('this', ('for', 'all', 'this', 'problems', '.', False, False, 0, 0)),
 ('probl

In [92]:
import numpy as np
def f(phrase):
    phrase = context2(phrase)
    sent = nltk.defaultdict(list)
    for w in phrase:
        x = []
        for w_0 in c:
            x.append(int(w[1][2]==w_0))
        for w_1 in p1:
            x.append(int(w[1][1]==w_1))
        sent[w[0]].extend(x)
    return [(k,v) for k,v in sent.items()]
            
print(f("Yes I agree with you my best friend. I'm sorry for all this problems."))

[('Yes', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [93]:
print([(k,np.sum(v)) for k,v in f("Yes I agree with you my best friend. I'm sorry for all this problems.")])

[('Yes', 1), ('I', 3), ('agree', 1), ('with', 1), ('you', 2), ('my', 2), ('best', 2), ('friend', 1), ('.', 3), ("'m", 2), ('sorry', 1), ('for', 1), ('all', 2), ('this', 2), ('problems', 2)]


In [95]:
sss = "Yes I agree with you my best friend. I'm sorry for all this problems."
ttt = f(sss)
tttt = c.copy()
tttt.extend(p1)
for k in ttt:
    print(tttt[k[1].index(1)])

0
I
I
with
you
my
best
best
.
'm
'm
for
all
this
problems


In [32]:
current = nltk.defaultdict(int)
precedent = nltk.defaultdict(int)
sent_begin = nltk.defaultdict(int)

for i in [k[1] for k in context(text[:100]) if k[0][1]=="NNP"]:
    current[i[2]] += 1
    precedent[i[1]] += 1

In [33]:
current

defaultdict(int,
            {'Pierre': 1,
             'Vinken': 2,
             'nonexecutive': 1,
             'Mr.': 1,
             'chairman': 3,
             'of': 13,
             ',': 9,
             'Rudolph': 1,
             'Agnew': 1,
             'Consolidated': 1,
             'Gold': 1,
             'to': 4,
             'Lorillard': 4,
             'Inc.': 2,
             'New': 2,
             'York-based': 1,
             '*T*-2': 1,
             'today': 1,
             "'s": 4,
             'Journal': 1,
             'said': 4,
             '*T*-1': 3,
             'James': 1,
             'Talcott': 3,
             'Boston': 1,
             'Institute': 1,
             'Dr.': 1,
             'from': 1,
             'the': 10,
             'National': 2,
             'schools': 1,
             'University': 3,
             '9.8': 1,
             'at': 3,
             'Groton': 1,
             '*-4': 1,
             'Hollingsworth': 1,
             '&': 1,
         

In [None]:
current = nltk.defaultdict(int)
precedent = nltk.defaultdict(int)
sent_begin = nltk.defaultdict(int)

for i in [k[1] for k in context3(text[:100]) if k[0][1]=="NNP"]:
    current[i[1]] += 1
    precedent[i[0]] += 1
    sent_begin[str(i[9])] += 1
    
""" Pour tout t:
1) Construire:
preced1 = nltk.defaultdict
..
follow2 = nltk.defaultdict

Garder les key pour lesquelles value > 3  --> exemple : prec["the"]=18

                                    -------------------------------

2) On les stock dans un vecteur, tout en respectant l'ordre (prec2¹,prec2²,...prec2⁹, prec1¹,...,current¹.. )
Remarque prec2¹ = précédé par "the" ; alors on pourra le noter "the_2"
         follow2⁵ = suivi par "the" ; alors on pourra le noter "the+2"

3) définir la fonction f_t, qui pour chaque couple (t,x) associe un np avec des 0 ou 1 selon si
la condition est vérifiée. --> exemple 't' est-il précédé par 'w' (si oui 1) et (0 sinon)

4) On définit la fonction vectorielle f dont les coordonnées sont f_t 

!! En prenant soin de supprimer les redondances -> Par exemple si f_NNP contient "précédé par 'the'"
et f_ADJ contient "précédé par 'the'" aussi, alors on en garde que 1.

-------------------------------------------------------------------------------------------------------

Autre méthode:
1) Se concentrer sur prec1 d'abord (pour tout t) afin d'obtenir au final un seul grand vecteur 
contenant tous les mots précédant les tags

"""

In [104]:
##############################################""" Context """###################################
text = nltk.corpus.treebank.tagged_sents()
def con_sent(s):
    """coordinates represents, in the same order:
    preceding word (by 2 positions), 
    preceding word (by 1 position)
    central word
    1st following word
    2nd following word
    is it a number
    starts with capital
    is it a starting word
    is an ending word
    etc."""    
    if len(s)>=5 :
        contxt = {}
        # First word :
        contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], s[1][0],s[2][0],s[0][0].isdigit(), s[0][0][0].isupper(), 1, 0)
        # Second word :
        contxt[(s[1][0],s[1][1],2)] = (0, s[0][0], s[1][0], s[2][0], s[3][0], s[1][0].isdigit(), s[1][0][0].isupper(), 0, 0)
        # For middle words
        for i, w in enumerate(s[2:-2]) :
            contxt[(w[0],w[1],i+3)] = (s[i][0], s[i+1][0], s[i+2][0], s[i+3][0], s[i+4][0], w[0].isdigit(), w[0][0].isupper(), 0, 0)
        # The word before last - the penultimate one - the second word to last
        contxt[(s[-2][0],s[-2][1],len(s)-1)] = (s[-4][0], s[-3][0], s[-2][0], s[-1][0], 0, s[-2][0].isdigit(), s[-2][0][0].isupper(),0, 0)
        # The last word   
        contxt[(s[-1][0],s[-1][1],len(s))] = (s[-3][0], s[-2][0], s[-1][0], 0, 0, s[-1][0].isdigit(), s[-1][0][0].isupper(), 0, 1)
    
    elif len(s)==4:
        contxt = {}
        # First word :
        contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], s[1][0],s[2][0],s[0][0].isdigit(), s[0][0][0].isupper(), 1, 0)
        # Second word :
        contxt[(s[1][0],s[1][1],2)] = (0, s[0][0], s[1][0], s[2][0], s[3][0], s[1][0].isdigit(), s[1][0][0].isupper(), 0, 0)
        # The word before last - the penultimate one - the second word to last
        contxt[(s[-2][0],s[-2][1],len(s)-1)] = (s[-4][0], s[-3][0], s[-2][0], s[-1][0], 0, s[-2][0].isdigit(), s[-2][0][0].isupper(),0, 0)
        # The last word   
        contxt[(s[-1][0],s[-1][1],len(s))] = (s[-3][0], s[-2][0], s[-1][0], 0, 0, s[-1][0].isdigit(), s[-1][0][0].isupper(), 0, 1)        
    
    elif len(s)==3:
        contxt = {}
        # First word :
        contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], s[1][0],s[2][0],s[0][0].isdigit(), s[0][0][0].isupper(), 1, 0)
        # Second word :
        contxt[(s[1][0],s[1][1],2)] = (0, s[0][0], s[1][0], s[2][0], 0, s[1][0].isdigit(), s[1][0][0].isupper(), 0, 0)
        # The last word   
        contxt[(s[-1][0],s[-1][1],len(s))] = (s[-3][0], s[-2][0], s[-1][0], 0, 0, s[-1][0].isdigit(), s[-1][0][0].isupper(), 0, 1)        
    
    elif len(s)==2:
        contxt = {}
        # First word :
        contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], s[1][0],0,s[0][0].isdigit(), s[0][0][0].isupper(), 1, 0)
        # The last word   
        contxt[(s[-1][0],s[-1][1],len(s))] = (0, s[0][0], s[1][0], 0, 0, s[-1][0].isdigit(), s[-1][0][0].isupper(), 0, 1)        
           
    else: contxt={}; contxt[(s[0][0],s[0][1],1)] = (0, 0, s[0][0], 0,0,s[0][0].isdigit(), s[0][0][0].isupper(), 1, 1)
    return ([(k,v) for k,v in contxt.items()])

def con_text(text):
    txt = []
    for s in text :
        txt.extend(con_sent(s))
    return txt

In [108]:
con_text(text)

[(('Pierre', 'NNP', 1), (0, 0, 'Pierre', 'Vinken', ',', False, True, 1, 0)),
 (('Vinken', 'NNP', 2), (0, 'Pierre', 'Vinken', ',', '61', False, True, 0, 0)),
 ((',', ',', 3), ('Pierre', 'Vinken', ',', '61', 'years', False, False, 0, 0)),
 (('61', 'CD', 4), ('Vinken', ',', '61', 'years', 'old', True, False, 0, 0)),
 (('years', 'NNS', 5), (',', '61', 'years', 'old', ',', False, False, 0, 0)),
 (('old', 'JJ', 6), ('61', 'years', 'old', ',', 'will', False, False, 0, 0)),
 ((',', ',', 7), ('years', 'old', ',', 'will', 'join', False, False, 0, 0)),
 (('will', 'MD', 8), ('old', ',', 'will', 'join', 'the', False, False, 0, 0)),
 (('join', 'VB', 9),
  (',', 'will', 'join', 'the', 'board', False, False, 0, 0)),
 (('the', 'DT', 10),
  ('will', 'join', 'the', 'board', 'as', False, False, 0, 0)),
 (('board', 'NN', 11),
  ('join', 'the', 'board', 'as', 'a', False, False, 0, 0)),
 (('as', 'IN', 12),
  ('the', 'board', 'as', 'a', 'nonexecutive', False, False, 0, 0)),
 (('a', 'DT', 13),
  ('board', 'as'